*** LIS Cross-National Data Center in Luxembourg

* email: usersupport@lisdatacenter.org 

*** LIS Self Teaching Package 2022
*** Part II: Gender, employment, and wages
*** Stata version

* last change of this version of the syntax: 15-01-2022.

 
** Exercise 8: Pooled regressions and normalised weights

* Variables kept from the files named by the $<ccyy>h globals (see make_data).
global varshh hid own

* Variables read from the files named by the $<ccyy>p globals, assembled in
* groups so the list stays readable: identifiers and weights first, then
* demographics, then education and labour-market variables.
global varspp "hid dname pwgt ppopwgt"
global varspp "$varspp relation partner ageyoch age sex immigr"
global varspp "$varspp educ educ_c emp status1 ptime1 hwage1"

* Dataset codes that are pooled; each code is matched against dname later on.
global datasets us04 be04 gr04

* Allow the do-file to be re-run in the same Stata session.
capture program drop make_data
program define make_data
    * Build the pooled file ${mydata}exercise2_LIS.
    *
    * For every dataset code in $datasets:
    *   1. load the variables in $varspp from the file named by global <ccyy>p,
    *   2. merge in the variables in $varshh from the file named by global
    *      <ccyy>h (many persons to one household, key hid),
    *   3. keep observations aged 25-54 with relation <= 2200,
    *   4. stack the result onto what earlier iterations saved.
    *
    * Reads globals: datasets, varspp, varshh, mydata, <ccyy>p, <ccyy>h.
    * Side effects: replaces the data in memory and overwrites
    * ${mydata}exercise2_LIS on disk.

    * Track the first iteration explicitly instead of comparing against a
    * hard-coded dataset name, so the pooled file is always started afresh
    * regardless of the order or content of $datasets.
    local first = 1
    foreach ccyy of global datasets {
        use $varspp using $`ccyy'p, clear
        merge m:1 hid using $`ccyy'h, keepusing($varshh)
        keep if inrange(age,25,54) & relation<=2200
        if !`first' {
            * Only later datasets are appended to the file built so far.
            append using "${mydata}exercise2_LIS"
        }
        save "${mydata}exercise2_LIS", replace
        local first = 0
    }
end

* Allow the do-file to be re-run in the same Stata session.
capture program drop recode_data
program define recode_data
    * Derive the regression variables on the dataset currently in memory.
    *
    * Reads global: datasets (codes compared with dname).
    * Expects variables: own ageyoch hwage1 ppopwgt dname educ_c age educ.
    * Creates: homeowner achildcat hourwage hourwage_log iqr upper_bound
    *          lower_bound logwage agesq youngchild oldchild mededuc hieduc
    *          ppp hourwage_ppp logwage_ppp belgium greece.

    * --- Household and family indicators ---------------------------------
    recode own (100/199=1) (200/299=0), gen(homeowner)
    recode ageyoch (. 18/max= 0 "no children <18") (0/5 = 1 "<6 years") (6/17 = 2 "6-17 years"), gen(achildcat)
    label var achildcat "Lowest age of own children"

    * --- Hourly wage: set negatives to zero, then take logs ---------------
    gen hourwage = hwage1
    replace hourwage=0 if hwage1<0
    gen hourwage_log=log(hourwage)
    * log(0) is missing; set it to 0 so that zero (and formerly negative)
    * wages stay in the distribution of non-missing hourly wages that is
    * used for the percentiles below.
    replace hourwage_log=0 if hourwage_log==. & hourwage!=.

    * --- Dataset-specific top and bottom coding ---------------------------
    * Create the helper variables once, before the loop, so that no dataset
    * name has to be special-cased and any content/order of $datasets works.
    gen iqr = .
    gen upper_bound = .
    gen lower_bound = .
    foreach ccyy of global datasets {
        sum hourwage_log [aw=ppopwgt] if dname=="`ccyy'", de
        * Bounds for extreme values: 3 interquartile ranges beyond the
        * weighted quartiles of the log hourly wage.
        replace iqr = r(p75)-r(p25) if dname=="`ccyy'"
        replace upper_bound = r(p75) + (iqr * 3) if dname=="`ccyy'"
        replace lower_bound = r(p25) - (iqr * 3) if dname=="`ccyy'"
        * Top code the hourly wage at the upper bound.
        replace hourwage=exp(upper_bound) if hourwage>exp(upper_bound) & !mi(hourwage) & dname=="`ccyy'"
        * Bottom code the hourly wage at the lower bound.
        replace hourwage=exp(lower_bound) if hourwage<exp(lower_bound) & !mi(hourwage) & dname=="`ccyy'"
    }

    * Detach any value label from educ_c.
    label values educ_c .

    * --- Regressors -------------------------------------------------------
    gen logwage = ln(hourwage)
    gen agesq=age^2
    recode achildcat (1=1) (0 2=0) (else=.), gen(youngchild)
    recode achildcat (2=1) (0 1=0) (else=.), gen(oldchild)
    recode educ (2=1) (1 3=0) (else=.), gen(mededuc)
    recode educ (3=1) (1 2=0) (else=.), gen(hieduc)

    * --- Deflate wages by the dataset-specific ppp factor (us04 = 1) ------
    gen ppp = .
    replace ppp=0.86 if dname=="be04"
    replace ppp=0.65 if dname=="gr04"
    replace ppp=1 if dname=="us04"
    gen hourwage_ppp = hourwage/ppp
    gen logwage_ppp = log(hourwage_ppp)

    * --- Dataset dummies (observations that are neither be04 nor gr04
    *     form the reference category) -----------------------------------
    gen belgium=0
    replace belgium=1 if dname=="be04"
    gen greece=0
    replace greece=1 if dname=="gr04"
end

* Allow the do-file to be re-run in the same Stata session.
capture program drop get_estimates
program define get_estimates
    * Estimate the pooled log-wage regression separately for sex==1 and
    * sex==2 (labelled Men and Women in the table) and print both models
    * side by side.
    *
    * Expects the variables created by recode_data plus immigr, partner,
    * ptime1, ppopwgt and sex in memory. Requires eststo/esttab.

    * Drop estimation sets left over from earlier runs; otherwise esttab
    * would print them as additional columns and the two model titles
    * below would no longer line up with the models.
    eststo clear

    * One shared specification so both models cannot drift apart.
    local rhs "age agesq mededuc hieduc immigr partner youngchild oldchild ptime1 homeowner belgium greece"
    forvalues s = 1/2 {
        quietly eststo: regress logwage_ppp `rhs' [pw=ppopwgt] if sex==`s', vce(robust)
    }

    esttab, b(a2) se(a2) r2(a3) mtitles(Men Women)
end

* Driver: build the pooled file, load it, derive the regression variables,
* then print the regression table. Output of the two preparation steps is
* suppressed.
quietly make_data
* The path is quoted so that a ${mydata} directory containing spaces works.
use "${mydata}exercise2_LIS", clear
quietly recode_data
get_estimates